QSRR revisited. Hierarchical models

Author
Affiliation

Paweł Wiczling*

Department of Biopharmaceutics and Pharmacodynamics, Medical University of Gdańsk, Gen. J. Hallera 107, 80-416 Gdańsk, Poland

Published

April 23, 2025

Introductions

Setup

Code
library(ggplot2)
library(gridExtra)
library(patchwork)
library(tidyr)
library(dplyr)
library(GGally)
library(reshape2)
library(pracma)
library(here)
library(magick)
library(reticulate)
library(kableExtra)
require(visNetwork, quietly = TRUE)
library(igraph)
#remotes::install_github("metrumresearchgroup/mrgmisc")

set.seed(10271998) ## not required but assures repeatable results

source("helper-functions.R")
Code
data_deliv_dir = here::here("data","deliv")
data_derived_dir = here::here("data","derived")
if(!file.exists(data_deliv_dir)) dir.create(data_deliv_dir, recursive = T)
if(!file.exists(data_derived_dir)) dir.create(data_derived_dir, recursive = T)

figures_eda_dir <- here::here("deliv","figures", "eda")
tables_eda_dir <- here::here("deliv","tables", "eda")
if(!file.exists(figures_eda_dir)) dir.create(figures_eda_dir, recursive = T)
if(!file.exists(tables_eda_dir)) dir.create(tables_eda_dir, recursive = T)

model_dir <- here::here("model","stan")  
figures_dir <- here::here("deliv","figures", "stan")
tables_dir <- here::here("deliv","tables", "stan")
if(!file.exists(figures_dir)) dir.create(figures_dir, recursive = T)
if(!file.exists(model_dir)) dir.create(model_dir, recursive = T)
if(!file.exists(tables_dir)) dir.create(tables_dir, recursive = T)

EDA

We used a publicly available dataset that comprises the measurements of RP-HPLC retention times collected for 1026 analytes. The retention times were measured under isocratic conditions on Eclipse Plus C18 (Agilent) stationary phase with 3.5 μm particles. The experiments were conducted using a mixture of two solvents: solvent A, which was made of 0.1% formic acid in water, and solvent B, which was made of 0.1% formic acid in acetonitrile. The column temperature was set at 35^{}C. The data were collected by Boswell et al. and were used to create a method to predict retention time by Back-Calculating the Gradient.

Load data

Code
data <- read.csv(here(data_deliv_dir, "database_logk_1026.csv"), header = TRUE)
analytes_names  <- read.csv(here(data_deliv_dir,"database_logk_1026_analyte_names.csv"), header = TRUE)
smiles <- read.csv(here(data_deliv_dir,"smiles1026.smi"), sep = "\t", header = FALSE)
functional_groups = read.csv(here(data_deliv_dir, 'checkmol_functional_groups.csv'))
functional_groups_names = read.csv(here(data_deliv_dir,'checkmol_functional_group_names.csv'))
data_ACD = read.csv(here(data_deliv_dir,'ACD_pKas.csv'))
data_pH <- read.csv(here(data_deliv_dir,"pH.csv"),header = TRUE)
results <-read.csv(here(data_deliv_dir,"results_table.csv"))

#similarity_matrix <- make_similarity_matrix_fun(lower_tri_df)
smiles<-smiles %>% rename(ID=V2,smiles=V1) %>% select(ID,smiles)
smiles$smiles[905] = "CN(C1CCCCC1N1CCCC1)C(=O)Cc1ccc(c(c1)Cl)Cl" # remove tartrate moiety 
smiles$smiles[425] = "CC(Cc1ccc(cc1)OCC(=O)O)NCC(c1cccc(c1)Cl)O" # remove Na+ and dissociation

Python

Code
use_condaenv("rdkit-env", conda = "C:/Users/GUMed/anaconda3/Scripts/conda.exe", required = TRUE)
Code
Chem <- import("rdkit.Chem")
AllChem <- import("rdkit.Chem.AllChem")
Draw <- import("rdkit.Chem.Draw")
rdFMCS <- import("rdkit.Chem.rdFMCS")
rdmolops <- import("rdkit.Chem.rdmolops")

# Create MCS parameters
mcs_params <- rdFMCS$MCSParameters()
mcs_params$AtomCompareParameters$MatchValences <- TRUE 
mcs_params$AtomCompareParameters$RingMatchesRingOnly <- TRUE  
mcs_params$AtomCompareParameters$Compare <- rdFMCS$AtomCompare$CompareElements
mcs_params$BondCompareParameters$CompleteRingsOnly <- TRUE 
mcs_params$BondCompareParameters$Compare <- rdFMCS$BondCompare$CompareOrder
mcs_params$MaximizeBonds <- TRUE
mcs_params$Timeout <- 20L           
#mcs_params$Verbose <- FALSE
mcs_params$Threshold <- 0.5 

Maximum Common Substructure

Code
smiles_vec <- smiles$smiles
mols <- lapply(smiles_vec, function(smi) Chem$MolFromSmiles(smi))

n <- length(mols)
pair_count <- choose(n, 2)

# Pre-allocate result list
results_list <- vector("list", pair_count)
pair_idx <- 1

# Faster nested loop with list output
for (i in 1:(n - 1)) {
  mol1 <- mols[[i]]
  for (j in (i + 1):n) {
    mol2 <- mols[[j]]
    
    # Skip if any molecule is NULL
    if (is.null(mol1) || is.null(mol2)) {
      mcs_size <- NA
    } else {
      mcs_result <- rdFMCS$FindMCS(list(mol1, mol2), parameters = mcs_params)
      mcs_size <- mcs_result$numAtoms
    }

    results_list[[pair_idx]] <- list(
      mol1_index = i,
      mol2_index = j,
      mcs_size = mcs_size
    )
    pair_idx <- pair_idx + 1
  }
}

# Convert to data.frame once
results <- do.call(rbind, lapply(results_list, as.data.frame))

write.csv(results, file = here(data_deliv_dir,"results_table.csv"), row.names = FALSE)

Subsets

Code
# Function to get number of atoms from SMILES
get_atom_count <- function(smiles) {
  mol <- Chem$MolFromSmiles(smiles)
  return(mol$GetNumAtoms())
}

atom_count <- unlist(lapply(smiles$smiles, get_atom_count))
results$mol1_count = atom_count[results$mol1_index]
results$mol2_count = atom_count[results$mol2_index]

results_selected <- results %>%
  mutate(crit = mol1_count + mol2_count - 2*mcs_size)%>%
  group_by(mol1_index) %>%
  slice(which.min(crit))%>%
  filter(crit<12)

MCS Subsets

Code
extract_common_fragments <- function(mol, match_atoms) {
  
editable_mol <- Chem$RWMol(mol)
mcs_atom_indices <- as.integer(match_atoms)

# Remove all atoms not in MCS (do it in reverse to preserve indices)
all_indices <- 0:(mol$GetNumAtoms() - 1)
atoms_to_remove <- setdiff(all_indices, mcs_atom_indices)

for (idx in rev(atoms_to_remove)) {
  editable_mol$RemoveAtom(as.integer(idx))
}

# Get final molecule
common_mol <- editable_mol

num_atoms <- common_mol$GetNumAtoms()
 for (i in seq(0, num_atoms - 1)) {
     atom <- common_mol$GetAtomWithIdx(i)
     if (!atom$IsInRing() && atom$GetIsAromatic()) {
         atom$SetIsAromatic(FALSE)
     }
 }

 res <- try({Chem$SanitizeMol(common_mol)}, silent = TRUE)
  
  if (inherits(res, "try-error")) {
  common_smiles <- Chem$MolToSmiles(common_mol)
  common_smiles = paste0("Err:",common_smiles)
  } else {
  Chem$SanitizeMol(common_mol)
  common_smiles <- Chem$MolToSmiles(common_mol)
  }

return(common_smiles)
}

extract_attachemnt_string <- function(mol, match_atoms) {
  
  all_atoms <- seq_len(mol$GetNumAtoms()) - 1L
  diff_atoms <- setdiff(all_atoms, match_atoms)
  
attachment_strings <- c()
for (atom_idx in match_atoms) {
  atom <- mol$GetAtomWithIdx(atom_idx)
  atom_symbol <- atom$GetSymbol()

  for (bond in atom$GetBonds()) {
    nbr_idx <- bond$GetOtherAtomIdx(atom_idx)
    if (!(nbr_idx %in% match_atoms)) {
      nbr_atom <- mol$GetAtomWithIdx(nbr_idx)
      
      
      
      nbr_symbol <- nbr_atom$GetSymbol()
      bond_type <- as.character(bond$GetBondType())

      attachment_strings <- c(
        attachment_strings,
        sprintf("%d:%s %d:%s %s",
                atom_idx, atom_symbol,
                nbr_idx, nbr_symbol,
                bond_type)
      )
    }
  }
}
return(paste(attachment_strings, collapse = "."))
}

extract_diff_fragments <- function(mol, match_atoms) {
  
  all_atoms <- seq_len(mol$GetNumAtoms()) - 1L
  diff_atoms <- setdiff(all_atoms, match_atoms)
  
  if (length(diff_atoms) == 0) return("")
  rw_mol <- Chem$RWMol(mol)
  
  # Set atoms to remove (in reverse order to preserve indices)
  for (idx in sort(unlist(match_atoms), decreasing = TRUE)) {
    rw_mol$RemoveAtom(idx)
  }

  res <- try({
    mol_frags <- rdmolops$GetMolFrags(rw_mol, asMols = TRUE)
    }, silent = TRUE)
  
  if (inherits(res, "try-error")) {
  mol_frags <- rdmolops$GetMolFrags(rw_mol, asMols = TRUE, sanitizeFrags = FALSE)
  } else {
  mol_frags <- rdmolops$GetMolFrags(rw_mol, asMols = TRUE)
  }

  frag_smiles <- sapply(mol_frags, function(frag) as.character(Chem$MolToSmiles(frag)))
  return(paste(frag_smiles, collapse = "."))
}

compare_smiles_pair <- function(smiles1, smiles2, .mcs_params=mcs_params) {
  
  mol1 <- Chem$MolFromSmiles(smiles1)
  mol2 <- Chem$MolFromSmiles(smiles2)

  if (is.null(mol1) || is.null(mol2)) {
    return(data.frame(
    smile1 = smiles1,
    smile2 = smiles2,
    common = NA,
    to_remove = NA,
    to_remove_str = NA,
    to_add = NA,
    to_add_str = NA,
    structure = "<em>Invalid SMILES</em>"
    ))
  }

 mcs_result <- rdFMCS$FindMCS(list(mol1, mol2), parameters = mcs_params)
 smarts <- mcs_result$smartsString
 common_mol <- Chem$MolFromSmarts(smarts)

  # Get atom matches for highlighting
  match1 <- mol1$GetSubstructMatch(common_mol)
  match2 <- mol2$GetSubstructMatch(common_mol)

  # Compute unmatched SMILES
  to_remove <- extract_diff_fragments(mol1, match1)
  to_add <- extract_diff_fragments(mol2, match2)

  to_remove_str <- extract_attachemnt_string(mol1, match1)
  to_add_str <- extract_attachemnt_string(mol2, match2)
  
  common_smile <- extract_common_fragments(mol1, match1)
  
  # Generate 2D coordinates
  AllChem$Compute2DCoords(mol1)
  AllChem$Compute2DCoords(mol2)
    
  # Draw image
  img <- Draw$MolsToGridImage(
    list(mol1, mol2),
    highlightAtomLists = list(match1, match2),
    subImgSize = tuple(300L, 300L),
    legends = list("Mol 1", "Mol 2")
  )

  # Save to temp file and encode as base64 HTML img tag
  img_file <- tempfile(fileext = ".png")
  img$save(img_file)
  img_base64 <- base64enc::dataURI(file = img_file, mime = "image/png")

  
  gc()
  py_run_string("import gc; gc.collect()")
 # img_magick <- image_read(img_file)
 # print(img_magick)  # Displays in RStudio viewer or default graphics device

  # Return data frame row
  data.frame(
    smile1 = smiles1,
    smile2 = smiles2,
    common = common_smile,
    to_remove = to_remove,
    to_remove_str= to_remove_str,
    to_add = to_add,
    to_add_str=to_add_str,
    structure = sprintf('<img src="%s" style="width:600px; height:auto;"/>', img_base64), 
    stringsAsFactors = FALSE)
}

# Extract the vectors
rows <- smiles$smiles[results_selected$mol1_index]
cols <- smiles$smiles[results_selected$mol2_index]

comparison_table <- purrr::map2_df(rows, cols, compare_smiles_pair,
                                   .progress = list(
  type = "iterator", 
  format = "Calculating {cli::pb_bar} {cli::pb_percent} {cli::pb_current}/{cli::pb_total}",
  clear = FALSE))

write.csv(comparison_table, file = here(data_deliv_dir,"comparison_table.csv"), row.names = FALSE)

Add membership groups

Code
uid = unique(c(results_selected$mol1_index,results_selected$mol2_index))
nodes <- data.frame(id = uid,label = paste(uid))
edges <- data.frame(from = results_selected$mol1_index,
                    to = results_selected$mol2_index, 
                    length=results_selected$crit*10,
                    label = paste(results_selected$crit))
g <- graph_from_data_frame(d = edges, vertices = nodes, directed = TRUE)
components <- clusters(g, mode = "weak")
nodes$group <- components$membership

results_selected <- results_selected%>%
  mutate(id=mol1_index) %>%
  left_join(nodes) %>%
  mutate(group = if_else(is.na(group), 999, group))

Visualise Network

Code
visNetwork(nodes, edges, width = "100%") %>% 
  visLegend(width = 0.1, position = "right", main = "Group")

Test

Code
# Extract the vectors
rows <- smiles$smiles[1]
cols <- smiles$smiles[824]
comparison_table <- purrr::map2_df(rows, cols, compare_smiles_pair) %>%
  select( "structure") %>%
  kbl(escape = FALSE, format = "html", align = "l") %>%
  kable_styling(full_width = FALSE, position = "center", bootstrap_options = c("striped", "hover", "condensed"))

Session info

Code
sessionInfo()
R version 4.3.1 (2023-06-16 ucrt)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 10 x64 (build 19045)

Matrix products: default


locale:
[1] LC_COLLATE=Polish_Poland.utf8  LC_CTYPE=Polish_Poland.utf8   
[3] LC_MONETARY=Polish_Poland.utf8 LC_NUMERIC=C                  
[5] LC_TIME=Polish_Poland.utf8    

time zone: Europe/Warsaw
tzcode source: internal

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] igraph_2.1.4      visNetwork_2.1.2  kableExtra_1.4.0  reticulate_1.42.0
 [5] magick_2.8.6      here_1.0.1        pracma_2.4.4      reshape2_1.4.4   
 [9] GGally_2.2.1      dplyr_1.1.4       tidyr_1.3.1       patchwork_1.2.0  
[13] gridExtra_2.3     ggplot2_3.5.1    

loaded via a namespace (and not attached):
 [1] utf8_1.2.4         generics_0.1.3     xml2_1.3.6         stringi_1.8.4     
 [5] lattice_0.21-8     digest_0.6.35      magrittr_2.0.3     evaluate_1.0.3    
 [9] grid_4.3.1         RColorBrewer_1.1-3 fastmap_1.2.0      rprojroot_2.0.4   
[13] plyr_1.8.9         jsonlite_1.8.8     Matrix_1.6-5       purrr_1.0.2       
[17] fansi_1.0.6        viridisLite_0.4.2  scales_1.3.0       textshaping_0.4.0 
[21] cli_3.6.2          rlang_1.1.3        munsell_0.5.1      withr_3.0.2       
[25] yaml_2.3.8         tools_4.3.1        colorspace_2.1-0   ggstats_0.8.0     
[29] png_0.1-8          vctrs_0.6.5        R6_2.5.1           lifecycle_1.0.4   
[33] stringr_1.5.1      htmlwidgets_1.6.4  ragg_1.3.2         pkgconfig_2.0.3   
[37] pillar_1.9.0       gtable_0.3.5       glue_1.7.0         Rcpp_1.0.12       
[41] systemfonts_1.1.0  xfun_0.44          tibble_3.2.1       tidyselect_1.2.1  
[45] rstudioapi_0.16.0  knitr_1.46         htmltools_0.5.8.1  svglite_2.1.3     
[49] rmarkdown_2.27     compiler_4.3.1